# Creating the entry point for spark
from pyspark import SparkContext,SQLContext
sc = SparkContext()
sqlContext = SQLContext(sc)
from pyspark.sql import SparkSession
sparkSession = SparkSession.builder.getOrCreate()
# Importing the below modules for data manipulation
from pyspark.sql.functions import col, countDistinct,isnan, when, count, unix_timestamp, to_date, from_unixtime, substring,date_format
from pyspark.sql import types as t
# numpy for numeric work
import numpy as np
# to handle data in form of rows and columns
import pandas as pd
# importing plotting libraries
import matplotlib.pyplot as plt
# NOTE(review): scipy.stats.itemfreq is deprecated/removed in modern SciPy
# and is never used below — confirm before keeping this import.
from scipy.stats import itemfreq
import seaborn as sns
from sklearn.model_selection import train_test_split
# IPython magic (this file is a notebook export): render plots inline
%matplotlib inline
from datetime import datetime
from sklearn import preprocessing
import statsmodels.api as sm
import statsmodels.formula.api as smf
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, mean_absolute_error, mean_squared_error
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
import graphviz
from sklearn.tree import export_graphviz
# special matplotlib argument for improved plots
from matplotlib import rcParams
# seaborn global theme for all figures below
sns.set_style("whitegrid")
sns.set_context("poster")
# Hypothesis
# Oil price plays a role in the total number of sales transactions & sales
# Loading Merged train daily transactions for store 44
sdfTrain = sqlContext.read.csv("/gl-capstone-data/Team6-C-Sep/Data/train_transactions_daily_store44.csv",header = True,inferSchema = True)
sdfTrain.printSchema()
sdfTrain.count()
# DataFrame.show() prints to stdout and returns None, so wrapping it in
# print() only appends a spurious "None" line — call it directly.
sdfTrain.show(5)
sqlContext.registerDataFrameAsTable(sdfTrain, "sdfTrainTbl")
Merge Train and Oil:
# Load the interpolated daily oil-price data
sdfOil = sqlContext.read.csv("/gl-capstone-data/Team6-C-Sep/Data/oil_2013_2015_interpolated.csv",header = True,inferSchema = True)
sdfOil.printSchema()
sdfOil.count()
# Date coverage of the oil data. The original used Python-2 `print x`
# statements, which are syntax errors under Python 3 — use print().
minRow = sdfOil.agg({"date": "min"}).collect()[0]
print(minRow["min(date)"])
maxRow = sdfOil.agg({"date": "max"}).collect()[0]
print(maxRow["max(date)"])
sqlContext.registerDataFrameAsTable(sdfOil, "sdfOilTbl")
# Merge Training dataset with interpolated oil dataset
sdfTrainWithOil = sdfTrain.join(sdfOil, ["date"],"leftouter")
sdfTrainWithOil.show(20)   # show() returns None — no print() wrapper needed
sdfTrainWithOil.count()
Merge Train and Holiday_events:
# Load the de-duplicated holiday/events data
sdfHolidayEvents = sqlContext.read.csv("/gl-capstone-data/Team6-C-Sep/Data/holidays_events_2013_2015_noduplicates.csv",header = True,inferSchema = True)
sdfHolidayEvents.show(5)   # show() prints and returns None — no print() wrapper needed
sdfHolidayEvents.count()
sqlContext.registerDataFrameAsTable(sdfHolidayEvents, "sdfHolidayEvents")
sdfHolidayEvents.printSchema()
# Left-join holiday/event attributes onto the train+oil data by date
sdfTrainWithOilHolidayEventsJoined = sdfTrainWithOil.join(sdfHolidayEvents, ["date"],"leftouter")
sqlContext.registerDataFrameAsTable(sdfTrainWithOilHolidayEventsJoined, "sdfTrainWithOilHolidayEventsJoined")
sqlContext.sql("select count(*) from sdfTrainWithOilHolidayEventsJoined").show()
# Date range of the joined dataset. print() replaces the original
# Python-2-only `print x` statements (syntax errors in Python 3).
minRow_Train = sdfTrainWithOilHolidayEventsJoined.agg({"date": "min"}).collect()[0]
print(minRow_Train["min(date)"])
maxRow_Train = sdfTrainWithOilHolidayEventsJoined.agg({"date": "max"}).collect()[0]
print(maxRow_Train["max(date)"])
#date_mask = (sdfTrainWithOilHolidayEventsJoined['date'] >= '2015-01-01') & (sdfTrainWithOilHolidayEventsJoined['date'] <= '2015-12-31')
# Pull the joined Spark DataFrame into pandas for local analysis/modeling
pd_train = sdfTrainWithOilHolidayEventsJoined.toPandas()
#Print the size
len(pd_train)
pd_train.head(5)
# Percentage of nulls per column
pd_train_nan = (pd_train.isnull().sum() / pd_train.shape[0]) * 100
pd_train_nan
# There are 86% of Nulls/NA in Holiday events attributes. Replacing Nulls or NA with No_Holiday as the default value.
# fillna() is the idiomatic equivalent of replace(np.NaN, ...) and avoids
# np.NaN, which was removed in NumPy 2.0 (the lowercase np.nan remains).
pd_train['type'] = pd_train['type'].fillna('No_Holiday')
pd_train['locale'] = pd_train['locale'].fillna('None')
pd_train['locale_name'] = pd_train['locale_name'].fillna('None')
pd_train['description'] = pd_train['description'].fillna('None')
pd_train['transferred'] = pd_train['transferred'].fillna('None')
# Rechecking for Nulls, No Nulls now
pd_train_nan = (pd_train.isnull().sum() / pd_train.shape[0]) * 100
pd_train_nan
# Inference
# 1089 Observations, 10 features
# Descriptive statistics reveal: average transaction volume is 4133 between 2013 - 2015 & Monthly average sales is $30586 for store 44 for the period 2013 - 2015
# Minimum transaction volume is 2333 & Maximum transaction volume is 8359.Min sales is $9067 and Max sales is $78070
# Oil price range from $34 to $110
#Shape
print('Shape : ', pd_train.shape, '\n')
#Type
print('Type : ', '\n', pd_train.dtypes)
#Summary
pd_train.describe()
pd_train.sample(10)
# Distinct values of the categorical holiday/event attributes
pd_train.type.unique()
pd_train.description.unique()
pd_train.locale.unique()
pd_train.transferred.unique()
# Formatting date to YYYY-MM-DD
# NOTE(review): assumes 'date' holds datetime-like objects here (strftime
# would fail on plain strings) — confirm the dtype coming out of toPandas().
pd_train['date']=pd_train['date'].apply(lambda x: x.strftime('%Y-%m-%d'))
pd_train['date'].sample(10)
pd_train.sample(10)
# Reformat the date - derive month and year columns
def get_month_year(df):
    """Split the 'date' column ('YYYY-MM-DD' strings) into 'month' and 'year'.

    Mutates *df* in place (adding string columns 'month' then 'year') and
    returns it for convenience.
    """
    df['month'] = df['date'].str.split('-').str[1]
    df['year'] = df['date'].str.split('-').str[0]
    return df
get_month_year(pd_train)   # adds 'month'/'year' columns; mutates pd_train in place
pd_train['date'] = pd.to_datetime(pd_train['date'])
# Series.dt.weekday_name was removed in pandas 0.25; dt.day_name() is the
# supported equivalent and returns the same English day names.
pd_train['day'] = pd_train['date'].dt.day_name()
pd_train = pd_train.drop('date', axis=1)
pd_train.sample(10)
# One-hot encode every categorical feature, then drop the originals
dummy_variables = ['type','store_nbr','locale', 'locale_name','transferred', 'month', 'day']
for var in dummy_variables:
    dummy = pd.get_dummies(pd_train[var], prefix = var, drop_first = False)
    pd_train = pd.concat([pd_train, dummy], axis = 1)
pd_train = pd_train.drop(dummy_variables, axis = 1)
# 'year' and free-text 'description' are not used as model features
pd_train = pd_train.drop(['year'], axis = 1)
pd_train = pd_train.drop("description", axis=1)
pd_train.sample(10)
# Scatter plus fitted regression line of each target against the oil price.
for target_col in ('transactions', 'unit_sales'):
    sns.regplot(x='dcoilwtico',
                y=target_col,
                data=pd_train,
                scatter_kws={'alpha':0.3},
                line_kws={'color':'black'})
#Re-scale Sales, Transactions, Oil Price using the standard scaler
scaler = preprocessing.StandardScaler()
# Series.reshape was removed in pandas (deprecated 0.19, gone in 1.0);
# .values.reshape(-1, 1) yields the 2-D (n_samples, 1) array sklearn expects.
# Each column is fit_transformed independently, i.e. scaled by its own
# mean/std, which matches the original behavior.
pd_train['unit_sales'] = scaler.fit_transform(pd_train['unit_sales'].values.reshape(-1,1))
pd_train['dcoilwtico'] = scaler.fit_transform(pd_train['dcoilwtico'].values.reshape(-1,1))
pd_train['transactions'] = scaler.fit_transform(pd_train['transactions'].values.reshape(-1,1))
pd_train.sample(10)
print('Shape : ', pd_train.shape)
# The features highly correlated with transactions is Unit sales, Remove unit sales from the dependent variables.
# As the number of sales increases, total sales also increases and vice versa. Hence they are highly correlated and sales can be removed.
import matplotlib.pyplot as plt
# Pairwise correlation matrix of all (now fully numeric) columns
corr = pd_train.corr()
corr.style.background_gradient()
plt.figure(figsize=(32,32))
# fignum=1 draws the matrix into the figure created above instead of a new one
plt.matshow(pd_train.corr(), cmap=plt.cm.Reds, fignum=1)
plt.colorbar()
tick_marks = [i for i in range(len(pd_train.columns))]
plt.xticks(tick_marks, pd_train.columns, rotation=90)
plt.yticks(tick_marks, pd_train.columns)
# Keeping only the highly correlated features for plotting the distribution
pd_train_filtered = pd_train[['transactions','dcoilwtico','unit_sales','type_Additional','month_12','day_Saturday','day_Sunday']]
# Check the data distribution through pairplot
sns.pairplot(pd_train_filtered)
# Pairwise regression plots among the three continuous variables
# (oil price vs transactions, oil price vs sales, transactions vs sales).
for x_col, y_col in (('dcoilwtico', 'transactions'),
                     ('dcoilwtico', 'unit_sales'),
                     ('transactions', 'unit_sales')):
    sns.regplot(x=x_col,
                y=y_col,
                data=pd_train,
                scatter_kws={'alpha':0.3},
                line_kws={'color':'black'})
Is there any statistically significant relationship between oil price and transaction volume for store 44?
Null hypothesis H0: Oil price and transaction volume are independent of each other.
Alternative hypothesis HA: Oil price and transaction volume are not independent; there is a relationship between them.
Oil price — independent continuous variable
Transaction volume — dependent continuous variable
# OLS regression of transactions on oil price (tests H0: slope == 0)
lin_model = smf.ols(formula = 'transactions ~ dcoilwtico', data = pd_train).fit()
#print the summary
print(lin_model.summary())
# Inference
# p-value is < 0.05 and hence there is relationship between transaction volume and oil price.
# So, rejecting the null hypothesis
Is there any statistically significant relationship between oil price and sales for store 44?
Null hypothesis H0: Oil price and sales are independent of each other.
Alternative hypothesis HA: Oil price and sales are not independent; there is a relationship between them.
Oil price — independent continuous variable
Sales — dependent continuous variable
# OLS regression of unit sales on oil price (tests H0: slope == 0)
lin_model = smf.ols(formula = 'unit_sales ~ dcoilwtico', data = pd_train).fit()
#print the summary
print(lin_model.summary())
# Inference
# p-value is < 0.05 and hence there is a relationship between oil price and sales.
# So, rejecting the null hypothesis
# Modeling for Transactions as the dependent variable and all the other variables from the filtered dataframe except total sales as the independent variables.
X_train = pd_train.drop(['unit_sales','transactions'], axis = 1)
y_labels = pd_train['transactions']
# 35% hold-out; fixed random_state for reproducibility
num_test = 0.35
X_train, X_test, y_train, Y_test = train_test_split(X_train, y_labels, test_size=num_test, random_state=15)
print('X_train shape :', X_train.shape)
print('y_train shape :', y_train.shape)
print('X_test shape :', X_test.shape)
print('y_test shape :', Y_test.shape)
regression_model = LinearRegression()
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent.
regression_model.fit(X_train.values, y_train)
print(regression_model.coef_)
print(regression_model.intercept_)
# Checking the coefficients for each of the independent attributes
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[idx]))
# Let us check the intercept for the model
intercept = regression_model.intercept_
print("The intercept for the model is {}".format(intercept))
# R^2 on the held-out test set
regression_model.score(X_test, Y_test)
# checking the sum of squared errors by predicting value of y for test cases and
# subtracting from the actual y for the test cases
mse = np.mean((regression_model.predict(X_test)-Y_test)**2)
# underroot of mean_sq_error is standard deviation i.e. avg variance between predicted and actual
import math
math.sqrt(mse)
# predict transaction volume for a set of attributes
y_pred = regression_model.predict(X_test)
# Since this is regression, plot the predicted y value vs actual y values for the test data
plt.scatter(Y_test, y_pred)
from sklearn.metrics import r2_score
# Test-set R^2, stored for the final model-comparison summary
R2_Lin_Reg_Tran_Pred = r2_score(Y_test, y_pred)
print(R2_Lin_Reg_Tran_Pred)
y_train_pred = regression_model.predict(X_train)
# Print the results of MAE (mean absolute error), MSE (Mean squared Error), RMSE (Root Mean squared error) - Train data
print(mean_absolute_error(y_train, y_train_pred))
print(mean_squared_error(y_train, y_train_pred))
print(np.sqrt(mean_squared_error(y_train, y_train_pred)))
# Test-set RMSE, stored for the final model-comparison summary
RMSE_Lin_Reg_Tran_Pred = np.sqrt(mean_squared_error(Y_test, y_pred))
# Print the results of MAE (mean absolute error), MSE (Mean squared Error), RMSE (Root Mean squared error) - Test data
print("The mean absolute error of simple linear regression model is : " + str(mean_absolute_error(Y_test, y_pred)))
print("The mean squared error of simple linear regression model is : " + str(mean_squared_error(Y_test, y_pred)))
print("The root mean squared error of simple linear regression model is : " + str(RMSE_Lin_Reg_Tran_Pred))
# Criterion : Mean absolute error
# NOTE(review): criterion="mae" was renamed to "absolute_error" in
# scikit-learn 1.2; keep "mae" only while pinned to an older sklearn.
dtree = DecisionTreeRegressor(random_state=0, criterion="mae")
dtree_fit = dtree.fit(X_train, y_train)
# 5-fold CV R^2 on the training data (cross_val_score refits fresh clones)
dtree_scores = cross_val_score(dtree_fit, X_train, y_train, cv = 5)
print("mean cross validation score: {}".format(np.mean(dtree_scores)))
print("score without cv: {}".format(dtree_fit.score(X_train, y_train)))
# on the test or hold-out set
from sklearn.metrics import r2_score
y_pred = dtree_fit.predict(X_test)
# Test-set R^2, stored for the final model-comparison summary
R2_DT_WoutGCV_Tran_Pred = r2_score(Y_test, y_pred)
print(R2_DT_WoutGCV_Tran_Pred)
print(dtree_fit.score(X_test, Y_test))
final_mae = mean_absolute_error(Y_test, y_pred)
final_mse = mean_squared_error(Y_test, y_pred)
final_rmse = np.sqrt(final_mse)
RMSE_DT_WoutGCV_Tran_Pred = final_rmse
# Print the results of MAE (mean absolute error), MSE (Mean squared Error), RMSE (Root Mean squared error) - Test data
print("The mean absolute error of decision tree without gridsearchcv is : " + str(final_mae))
print("The mean squared error of decision tree without gridsearchcv is : " + str(final_mse))
print("The root mean squared error of decision tree without gridsearchcv is : " + str(final_rmse))
# Tune min_samples_split with a 5-fold grid search, scoring by R^2
scoring = make_scorer(r2_score)
g_cv = GridSearchCV(DecisionTreeRegressor(random_state=0),
param_grid={'min_samples_split': range(2, 10)},
scoring=scoring, cv=5, refit=True)
g_cv.fit(X_train, y_train)
result = g_cv.cv_results_
# print(result)
# Held-out R^2 of the tuned tree, stored for the final comparison summary
R2_DT_WithGCV_Tran_Pred = r2_score(Y_test, g_cv.best_estimator_.predict(X_test))
print(R2_DT_WithGCV_Tran_Pred)
y_pred = g_cv.best_estimator_.predict(X_test)
final_mae = mean_absolute_error(Y_test, y_pred)
final_mse = mean_squared_error(Y_test, y_pred)
final_rmse = np.sqrt(final_mse)
RMSE_DT_WithGCV_Tran_Pred = final_rmse
# Print the results of MAE (mean absolute error), MSE (Mean squared Error), RMSE (Root Mean squared error) - Test data
print("The mean absolute error of decision tree with gridsearchcv is : " + str(final_mae))
print("The mean squared error of decision tree with gridsearchcv is : " + str(final_mse))
print("The root mean squared error of decision tree with gridsearchcv is : " + str(final_rmse))
print("The best params from gridsearchcv are :" + str(g_cv.best_params_))
print("The best estimators from gridsearchcv are :" + str(g_cv.best_estimator_))
# Printing the decision tree
# (export once — the original exported a bare tree first and immediately
# overwrote both dot_data and graph, so that first pair was dead code)
dot_data = export_graphviz(dtree_fit, out_file=None,
                           feature_names=X_train.columns,
                           filled=True, rounded=True,
                           special_characters=True)
graph = graphviz.Source(dot_data)
graph
# Inference:
# Oil price is the main predictor value using which decision tree is built and is an influencing factor in the total transaction volume.
# Choose the type of regressor (RandomForestRegressor — the original
# comment said "classifier").
RFR = RandomForestRegressor()
# Choose some parameter combinations to try
# NOTE(review): the 'mse'/'mae' criterion names were renamed to
# 'squared_error'/'absolute_error' in scikit-learn 1.2 — update this grid
# if running on a current sklearn.
parameters = {'n_estimators': [5, 10, 100],
              'criterion': ['mse','mae'],
              'max_depth': [5, 10, 15],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1,5]
             }
# Run the grid search: 5-fold CV, all cores in parallel
grid_obj = GridSearchCV(RFR, parameters,
                        cv=5, #Determines the cross-validation splitting strategy /to specify the number of folds in a (Stratified)KFold
                        n_jobs=-1, #Number of jobs to run in parallel
                        verbose=1)
grid_obj = grid_obj.fit(X_train, y_train)
# Set RFR to the best combination of parameters
RFR = grid_obj.best_estimator_
# Fit the best algorithm to the data.
RFR.fit(X_train, y_train)
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
predictions = RFR.predict(X_test)
R2_RF_WithGCV_Tran_Pred = r2_score(Y_test, predictions)
print('R2 score = ',R2_RF_WithGCV_Tran_Pred, '/ 1.0')
print("The mean absolute error of random forest with gridsearchcv is : " + str(mean_absolute_error(Y_test, predictions)))
print("The mean squared error of random forest with gridsearchcv is : " + str(mean_squared_error(Y_test, predictions)))
print("The root mean squared error of random forest with gridsearchcv is : " + str(np.sqrt(mean_squared_error(Y_test, predictions))))
RMSE_RF_WithGCV_Tran_Pred = np.sqrt(mean_squared_error(Y_test, predictions))
#Check and plot the 500 first predictions
# Series.as_matrix() was removed in pandas 1.0; .values is the equivalent.
plt.plot(Y_test.values[0:500], '+', color ='red', alpha=0.7)
# marker 'o' only — the original 'ro' set red, which color='green' overrode
plt.plot(predictions[0:500], 'o', color ='green', alpha=0.5)
plt.show()
# Modeling for Total sales as the dependent variable and all the other variables except transactions as the independent variables.
X_train = pd_train.drop(['unit_sales','transactions'], axis = 1)
y_labels = pd_train['unit_sales']
# 20% hold-out here (the transactions section above used 35%)
num_test = 0.20
X_train, X_test, y_train, Y_test = train_test_split(X_train, y_labels, test_size=num_test, random_state=15)
print('X_train shape :', X_train.shape)
print('y_train shape :', y_train.shape)
print('X_test shape :', X_test.shape)
print('Y_test shape :', Y_test.shape)
regression_model = LinearRegression()
# DataFrame.as_matrix() was removed in pandas 1.0; .values is the equivalent.
regression_model.fit(X_train.values, y_train)
print(regression_model.coef_)
print(regression_model.intercept_)
# Checking the coefficients for each of the independent attributes
for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[idx]))
# Let us check the intercept for the model
intercept = regression_model.intercept_
print("The intercept for the model is {}".format(intercept))
# R^2 on the held-out test set
regression_model.score(X_test, Y_test)
# checking the sum of squared errors by predicting value of y for test cases and
# subtracting from the actual y for the test cases
mse = np.mean((regression_model.predict(X_test)-Y_test)**2)
# underroot of mean_sq_error is standard deviation i.e. avg variance between predicted and actual
import math
math.sqrt(mse)
# predict sales for the held-out set (comment fixed: this section models
# unit_sales, not transaction volume)
y_pred = regression_model.predict(X_test)
# Since this is regression, plot the predicted y value vs actual y values for the test data
plt.scatter(Y_test, y_pred)
from sklearn.metrics import r2_score
R2_Lin_Reg_Sale_Pred = r2_score(Y_test, y_pred)
print(R2_Lin_Reg_Sale_Pred)
y_train_pred = regression_model.predict(X_train)
# Print the results of MAE (mean absolute error), MSE (Mean squared Error), RMSE (Root Mean squared error) - Train data
print(mean_absolute_error(y_train, y_train_pred))
print(mean_squared_error(y_train, y_train_pred))
print(np.sqrt(mean_squared_error(y_train, y_train_pred)))
# Store the TEST RMSE so the model-comparison summary compares every model
# on the same held-out metric (the original stored the TRAIN RMSE here,
# unlike the transaction-model section, which stored the test RMSE).
RMSE_Lin_Reg_Sale_Pred = np.sqrt(mean_squared_error(Y_test, y_pred))
# Print the results of MAE (mean absolute error), MSE (Mean squared Error), RMSE (Root Mean squared error) - Test data
print("The mean absolute error of simple linear regression model is : " + str(mean_absolute_error(Y_test, y_pred)))
print("The mean squared error of simple linear regression model is : " + str(mean_squared_error(Y_test, y_pred)))
print("The root mean squared error of simple linear regression model is : " + str(RMSE_Lin_Reg_Sale_Pred))
# Criterion : Mean absolute error
# NOTE(review): criterion="mae" was renamed to "absolute_error" in
# scikit-learn 1.2; keep "mae" only while pinned to an older sklearn.
dtree = DecisionTreeRegressor(random_state=0, criterion="mae")
dtree_fit = dtree.fit(X_train, y_train)
# 5-fold CV R^2 on the training data (cross_val_score refits fresh clones)
dtree_scores = cross_val_score(dtree_fit, X_train, y_train, cv = 5)
print("mean cross validation score: {}".format(np.mean(dtree_scores)))
print("score without cv: {}".format(dtree_fit.score(X_train, y_train)))
y_pred = dtree_fit.predict(X_test)
# on the test or hold-out set
from sklearn.metrics import r2_score
R2_DT_WoutGCV_Sale_Pred = r2_score(Y_test,y_pred)
print(R2_DT_WoutGCV_Sale_Pred)
print(dtree_fit.score(X_test, Y_test))
final_mae = mean_absolute_error(Y_test, y_pred)
final_mse = mean_squared_error(Y_test, y_pred)
final_rmse = np.sqrt(final_mse)
RMSE_DT_WoutGCV_Sale_Pred = final_rmse
# Print the results of MAE (mean absolute error), MSE (Mean squared Error), RMSE (Root Mean squared error) - Test data
print("The mean absolute error of decision tree without gridsearchcv is : " + str(final_mae))
print("The mean squared error of decision tree without gridsearchcv is : " + str(final_mse))
print("The root mean squared error of decision tree without gridsearchcv is : " + str(final_rmse))
# Tune min_samples_split with a 5-fold grid search, scoring by R^2
scoring = make_scorer(r2_score)
g_cv = GridSearchCV(DecisionTreeRegressor(random_state=0),
param_grid={'min_samples_split': range(2, 10)},
scoring=scoring, cv=5, refit=True)
g_cv.fit(X_train, y_train)
g_cv.best_params_
result = g_cv.cv_results_
# print(result)
# Held-out R^2 of the tuned tree, stored for the final comparison summary
R2_DT_WithGCV_Sale_Pred = r2_score(Y_test, g_cv.best_estimator_.predict(X_test))
print(R2_DT_WithGCV_Sale_Pred)
y_pred = g_cv.best_estimator_.predict(X_test)
final_mae = mean_absolute_error(Y_test, y_pred)
final_mse = mean_squared_error(Y_test, y_pred)
final_rmse = np.sqrt(final_mse)
RMSE_DT_WithGCV_Sale_Pred=final_rmse
# Print the results of MAE (mean absolute error), MSE (Mean squared Error), RMSE (Root Mean squared error) - Test data
print("The mean absolute error of decision tree with gridsearchcv is : " + str(final_mae))
print("The mean squared error of decision tree with gridsearchcv is : " + str(final_mse))
print("The root mean squared error of decision tree with gridsearchcv is : " + str(final_rmse))
print("The best params from gridsearchcv are :" + str(g_cv.best_params_))
print("The best estimators from gridsearchcv are :" + str(g_cv.best_estimator_))
# Printing the decision tree
# (export once — the original exported a bare tree first and immediately
# overwrote both dot_data and graph, so that first pair was dead code)
dot_data = export_graphviz(dtree_fit, out_file=None,
                           feature_names=X_train.columns,
                           filled=True, rounded=True,
                           special_characters=True)
graph = graphviz.Source(dot_data)
graph
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import GridSearchCV
# Choose the type of regressor (RandomForestRegressor — the original
# comment said "classifier").
RFR = RandomForestRegressor()
# Choose some parameter combinations to try
#YOU CAN TRY DIFFERENTS PARAMETERS TO FIND THE BEST MODEL
# NOTE(review): 'mse'/'mae' were renamed to 'squared_error'/'absolute_error'
# in scikit-learn 1.2 — update this grid if running on a current sklearn.
parameters = {'n_estimators': [5, 10, 100],
              'criterion': ['mse','mae'],
              'max_depth': [5, 10, 15],
              'min_samples_split': [2, 5, 10],
              'min_samples_leaf': [1,5]
             }
# Run the grid search: 5-fold CV, all cores in parallel
grid_obj = GridSearchCV(RFR, parameters,
                        cv=5, #Determines the cross-validation splitting strategy /to specify the number of folds in a (Stratified)KFold
                        n_jobs=-1, #Number of jobs to run in parallel
                        verbose=1)
grid_obj = grid_obj.fit(X_train, y_train)
# Set RFR to the best combination of parameters
RFR = grid_obj.best_estimator_
# Fit the best algorithm to the data.
RFR.fit(X_train, y_train)
print("The best params from gridsearchcv are :" + str(grid_obj.best_params_))
print("The best estimators from gridsearchcv are :" + str(grid_obj.best_estimator_))
from sklearn.metrics import r2_score
from sklearn.metrics import mean_squared_error
predictions = RFR.predict(X_test)
R2_RF_WithGCV_Sale_Pred = r2_score(Y_test, predictions)
print('R2 score = ',R2_RF_WithGCV_Sale_Pred, '/ 1.0')
print('MSE score = ',mean_squared_error(Y_test, predictions), '/ 0.0')
RMSE_RF_WithGCV_Sale_Pred = np.sqrt(mean_squared_error(Y_test, predictions))
print("The mean absolute error of random forest with gridsearchcv is : " + str(mean_absolute_error(Y_test, predictions)))
print("The mean squared error of random forest with gridsearchcv is : " + str(mean_squared_error(Y_test, predictions)))
print("The root mean squared error of random forest with gridsearchcv is : " + str(RMSE_RF_WithGCV_Sale_Pred))
#Check and plot the 500 first predictions
# Series.as_matrix() was removed in pandas 1.0; .values is the equivalent.
plt.plot(Y_test.values[0:500], '+', color ='blue', alpha=0.7)
# marker 'o' only — the original 'ro' set red, which color='red' made redundant
plt.plot(predictions[0:500], 'o', color ='red', alpha=0.5)
plt.show()
Transaction Prediction: Different Models: Metric: R-squared
# Transaction prediction — R^2 of each model on its held-out test set
print("Linear regression : R2 score : " + str(R2_Lin_Reg_Tran_Pred))
print("Decision Tree without GridsearchCV : R2 score : " + str(R2_DT_WoutGCV_Tran_Pred))
print("Decision Tree with GridsearchCV : R2 score : " + str(R2_DT_WithGCV_Tran_Pred))
print("Random Forest with GridsearchCV : R2 score : "+ str(R2_RF_WithGCV_Tran_Pred))
Sales Prediction: Different Models: Metric: R-squared
# Sales prediction — R^2 of each model on its held-out test set
print("Linear regression : R2 score : " + str(R2_Lin_Reg_Sale_Pred))
print("Decision Tree without GridsearchCV : R2 score : " + str(R2_DT_WoutGCV_Sale_Pred))
print("Decision Tree with GridsearchCV : R2 score : " + str(R2_DT_WithGCV_Sale_Pred))
print("Random Forest with GridsearchCV : R2 score : "+ str(R2_RF_WithGCV_Sale_Pred))
Transaction Prediction: Different Models: Metric: RMSE
# Transaction prediction — RMSE of each model on its held-out test set
print("Linear regression : RMSE : " + str(RMSE_Lin_Reg_Tran_Pred))
print("Decision Tree without GridsearchCV : RMSE : " + str(RMSE_DT_WoutGCV_Tran_Pred))
print("Decision Tree with GridsearchCV : RMSE : " + str(RMSE_DT_WithGCV_Tran_Pred))
print("Random Forest with GridsearchCV : RMSE : "+ str(RMSE_RF_WithGCV_Tran_Pred))
Sales Prediction: Different Models: Metric: RMSE
# Sales prediction — RMSE of each model on its held-out test set.
# BUG FIX: the original printed the transaction-model RMSE variables
# (RMSE_*_Tran_Pred) under the Sales heading; report the sales-model RMSEs.
print("Linear regression : RMSE : " + str(RMSE_Lin_Reg_Sale_Pred))
print("Decision Tree without GridsearchCV : RMSE : " + str(RMSE_DT_WoutGCV_Sale_Pred))
print("Decision Tree with GridsearchCV : RMSE : " + str(RMSE_DT_WithGCV_Sale_Pred))
print("Random Forest with GridsearchCV : RMSE : "+ str(RMSE_RF_WithGCV_Sale_Pred))